import numpy as np #linear algebra
import pandas as pd #data manipulation and analysis
import matplotlib.pyplot as plt #data visualization
import seaborn as sns #data visualization
from scipy.spatial import ConvexHull, convex_hull_plot_2d
import matplotlib.path as mpath
import plotly.express as px
import plotly.graph_objects as go
import textwrap
# read-in dataframe with cluster ids
cluster2_df = pd.read_csv("laborEduClusterData.csv", index_col=0)
cluster2_df.head()
| Bachelor's degree or higher of persons age 25 years+, 2017-2021 | With a disability, under age 65 years, 2017-2021 | Persons without health insurance, under age 65 years | In civilian labor force, total of population age 16 years+, 2017-2021 | In civilian labor force, female of population age 16 years+, 2017-2021 | Total retail sales per capita, 2017 | Mean travel time to work (minutes), workers age 16 years+, 2017-2021 | Median household income (in 2021 dollars), 2017-2021 | Per capita income in past 12 months (in 2021 dollars), 2017-2021 | Persons in poverty | Total employer establishments, 2020 | Total employment, 2020 | Total annual payroll, 2020 ($1,000) | Population per square mile, 2020 | Land area in square miles, 2020 | cluster_id | Banned or not | County Name | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4406.72 | 4434.26 | 2588.95 | 14184.13 | 14018.88 | 9461 | 36.1 | 44467 | 24539 | 5205.44 | 385 | 4572 | 167427 | 47.1 | 583.87 | 0 | 0.0 | Adams County, Ohio |
| 1 | 19418.97 | 12912.09 | 8540.28 | 61917.03 | 58765.26 | 16266 | 19.5 | 55114 | 28671 | 15555.51 | 2286 | 45012 | 1983272 | 253.9 | 402.55 | 0 | 1.0 | Allen County, Ohio |
| 2 | 11614.15 | 5179.28 | 4708.44 | 32592.87 | 29663.17 | 9431 | 24.2 | 58168 | 28992 | 5702.44 | 1034 | 18234 | 701075 | 124.0 | 422.99 | 0 | 0.0 | Ashland County, Ohio |
| 3 | 14697.89 | 12361.80 | 9733.70 | 55190.08 | 51296.60 | 10406 | 25.8 | 49680 | 26777 | 15281.91 | 1806 | 24464 | 913850 | 139.0 | 702.07 | 0 | 0.0 | Ashtabula County, Ohio |
| 4 | 21161.10 | 8005.22 | 5833.26 | 34316.97 | 33386.13 | 11148 | 21.9 | 47061 | 24990 | 12969.70 | 1029 | 13265 | 469095 | 124.0 | 503.64 | 0 | 0.0 | Athens County, Ohio |
from functionsAll import split_dataframe_by_cluster, get_cluster_coords_dict,coords, clusterk_dict,\
cluster0, cluster1, cluster2
# 1
df_list = split_dataframe_by_cluster(cluster2_df, 'cluster_id')
# 2
coords_list = coords(df_list, cluster2_df)
# 3
clusterK_dict = clusterk_dict(df_list, coords_list)
for i in range(len(df_list)):
if len(df_list) ==1:
cluster0_dict = cluster0(clusterK_dict)
elif len(df_list) ==2:
cluster0_dict = cluster0(clusterK_dict)
cluster1_dict = cluster1(clusterK_dict)
elif len(df_list) ==3:
cluster0_dict = cluster0(clusterK_dict)
cluster1_dict = cluster1(clusterK_dict)
cluster2_dict = cluster2(clusterK_dict)
print(f"cluster{i}_dict have been made to dictionary")
cluster0_dict have been made to dictionary cluster1_dict have been made to dictionary cluster2_dict have been made to dictionary
from functionsBanned import filter_banned_counties, get_banned_cluster_coords_dict, bannedCoords, clusterk_dict_banned,\
cluster0Banned, cluster1Banned, cluster2Banned
# 1
banned_counties_df = filter_banned_counties(cluster2_df)
# 2
bannedCoords_list = bannedCoords(banned_counties_df, cluster2_df)
# 3
clusterKBanned_dict = clusterk_dict_banned(banned_counties_df, bannedCoords_list)
Cluster0 had enough banned counties to find non-banned counties in the banned counties convex hull. Cluster2 had enough banned counties to find non-banned counties in the banned counties convex hull.
from countyName import banned_counties_list, countyNames_cluster0, countyNames_cluster1, countyNames_cluster2, merge_dicts
bannedCountiesList=banned_counties_list(cluster2_df)
# Make into function
for i in range(len(banned_counties_df)):
if len(banned_counties_df) ==1:
cluster0_banned_dict = cluster0Banned(clusterKBanned_dict)
allCounties=countyNames_cluster0(cluster2_df, cluster0_dict, cluster0_banned_dict, bannedCountiesList)
elif len(banned_counties_df) ==2:
cluster0_banned_dict = cluster0Banned(clusterKBanned_dict)
cluster1_banned_dict = cluster1Banned(clusterKBanned_dict)
countyName0=countyNames_cluster0(cluster2_df, cluster0_dict, cluster0_banned_dict, bannedCountiesList)
countyName1=countyNames_cluster1(cluster2_df, cluster1_dict, cluster1_banned_dict, bannedCountiesList)
allCounties = merge_dicts(countyName0, countyName1)
elif len(banned_counties_df) ==3:
cluster0_banned_dict = cluster0Banned(clusterKBanned_dict)
cluster1_banned_dict = cluster1Banned(clusterKBanned_dict)
cluster2_banned_dict = cluster2Banned(clusterKBanned_dict)
countyName0=countyNames_cluster0(cluster2_df, cluster0_dict, cluster0_banned_dict, bannedCountiesList)
countyName1=countyNames_cluster1(cluster2_df, cluster1_dict, cluster1_banned_dict, bannedCountiesList)
countyName2=countyNames_cluster0(cluster2_df, cluster2_dict, cluster2_banned_dict, bannedCountiesList)
print(f"cluster{i}_banned_dict have been made to dictionary")
cluster0_banned_dict have been made to dictionary cluster1_banned_dict have been made to dictionary
Anything above here is good
from topVariableFunctions import filtered_var_pairs, categoryCountyList, subcategoryCountyList, commonKeys, freq_var
filt = filtered_var_pairs(allCounties)
categorycountyList = categoryCountyList(filt)
The category with the most counties is 'Mean travel time to work (minutes), workers age 16 years+, 2017-2021' with 26 counties.
subcategorycountyList = subcategoryCountyList(filt)
The subcategory with the most counties is 'Median household income (in 2021 dollars), 2017-2021' with 26 counties.
combinedVars = commonKeys(categorycountyList, subcategorycountyList)
After all variables are combined, we can now take a look at some of the counties that may be at risk the most common variable amoung them
sorted_var_dict = dict(sorted(combinedVars.items(), key=lambda x: len(x[1]), reverse=True))
# Make ranking table of vars.
ranked_listTbl = [(i+1, item[0], len(item[1]), ", ".join([textwrap.fill(', '.join(textwrap.wrap(c, width=10)), width=80) for c in item[1]])) for i, item in enumerate(sorted_var_dict.items())]
topVarTbl = pd.DataFrame(ranked_listTbl)
topVarTbl.columns=['Rank', 'Demographic_variable', 'Number_of_counties', 'List_of_Counties']
# try to chnage lengh of cloumn
headerColor = 'grey'
rowEvenColor = 'lightgrey'
rowOddColor = 'white'
fig = go.Figure(data=[go.Table(
columnwidth = [50,110,75,700],
header=dict(
values=['Rank', 'Demographic Variable', 'Number of Counties', 'List of Counties'],
line_color='darkslategray',
fill_color=headerColor,
align=['left','center'],
font=dict(color='white', size=12)
),
cells=dict(
values=[topVarTbl.Rank, topVarTbl.Demographic_variable, topVarTbl.Number_of_counties, topVarTbl.List_of_Counties],
line_color='darkslategray',
# 2-D list of colors for alternating rows
fill_color = [[rowOddColor,rowEvenColor,rowOddColor, rowEvenColor,rowOddColor]*5],
align = ['center', 'center', 'center', 'left'],
font = dict(color = 'darkslategray', size = 11)
))
])
fig.show()
Checking to see which counties had the most demographic variables. Then showing a table of what they are
# # Make ranking table of vars.
ranked_list = [(i+1, item[0], len(item[1]), item[1]) for i, item in enumerate(sorted_var_dict.items())]
newdf2 = pd.DataFrame(ranked_list)
newdf2.columns=['Rank', 'Demographic_variable', 'Number_of_counties', 'List_of_Counties']
# Separate the list of counties into individual rows
newdf2 = newdf2.explode('List_of_Counties')
# Drop the duplicate columns
newdf2 = newdf2.drop_duplicates(subset=['Rank', 'List_of_Counties'])
# Rename the columns for clarity
newdf3 = newdf2.rename(columns={
'Demographic_variable': 'Demographic_Variable',
'Number_of_counties': 'Number_of_Counties',
'List_of_Counties': 'County'
})
# Display the updated data frame
newdf3.head()
| Rank | Demographic_Variable | Number_of_Counties | County | |
|---|---|---|---|---|
| 0 | 1 | Median household income (in 2021 dollars), 201... | 34 | Mercer |
| 0 | 1 | Median household income (in 2021 dollars), 201... | 34 | Morrow |
| 0 | 1 | Median household income (in 2021 dollars), 201... | 34 | Jefferson |
| 0 | 1 | Median household income (in 2021 dollars), 201... | 34 | Tuscarawas |
| 0 | 1 | Median household income (in 2021 dollars), 201... | 34 | Ashland |
# Melting OG data:
melted_df = pd.melt(cluster2_df, id_vars=['County Name'], var_name='Attribute', value_name='Value')
melted_df['County Name'] = melted_df['County Name'].apply(lambda x: x[:-13])
melted_df
| County Name | Attribute | Value | |
|---|---|---|---|
| 0 | Adams | Bachelor's degree or higher of persons age 25 ... | 4406.72 |
| 1 | Allen | Bachelor's degree or higher of persons age 25 ... | 19418.97 |
| 2 | Ashland | Bachelor's degree or higher of persons age 25 ... | 11614.15 |
| 3 | Ashtabula | Bachelor's degree or higher of persons age 25 ... | 14697.89 |
| 4 | Athens | Bachelor's degree or higher of persons age 25 ... | 21161.10 |
| ... | ... | ... | ... |
| 1491 | Wood | Banned or not | 0.00 |
| 1492 | Union | Banned or not | 1.00 |
| 1493 | Wyandot | Banned or not | 0.00 |
| 1494 | Summit | Banned or not | 0.00 |
| 1495 | Van Wert | Banned or not | 0.00 |
1496 rows × 3 columns
tidy_df = pd.merge(newdf3, melted_df[['County Name', 'Attribute', 'Value']], left_on=['County','Demographic_Variable'],
right_on=['County Name', 'Attribute'])
tidy_df.drop(['Attribute','County Name'], axis=1, inplace=True)
tidy_df.head()
| Rank | Demographic_Variable | Number_of_Counties | County | Value | |
|---|---|---|---|---|---|
| 0 | 1 | Median household income (in 2021 dollars), 201... | 34 | Mercer | 68692.0 |
| 1 | 1 | Median household income (in 2021 dollars), 201... | 34 | Morrow | 63411.0 |
| 2 | 1 | Median household income (in 2021 dollars), 201... | 34 | Jefferson | 49211.0 |
| 3 | 1 | Median household income (in 2021 dollars), 201... | 34 | Tuscarawas | 57545.0 |
| 4 | 1 | Median household income (in 2021 dollars), 201... | 34 | Ashland | 58168.0 |
# create bar chart trace
fig = px.bar(tidy_df, x=tidy_df.Demographic_Variable, y=tidy_df.Number_of_Counties, text='Value', color='County',
labels={"Attribute": "Attribute",
"Value": "Attribute Value"
},
title="Default behavior: some text is tiny",)
fig.update_traces(textposition='inside')
# update the layout to adjust the size of the plot
fig.update_layout(
width=1200, # set the width of the plot to 800 pixels
height=1400, # set the height of the plot to 600 pixels
)
# display the plot
fig.show()
freq_var =freq_var(ranked_list)
# var_freq to dictionary
var_freq_dict = {item[0]: item[1] for i, item in enumerate(freq_var)}
# sorting var_freq_dict dictionary
sorted_var_freq_dict = dict(sorted(var_freq_dict.items(), key=lambda x: len(x[1]), reverse=True))
# Make ranking table of vars.
ranked_var_freqViz = [(i+1, item[0], len(item[1]), ", ".join([textwrap.fill(' '.join(textwrap.wrap(c, width=10)), width=80) for c in item[1]])) for i, item in enumerate(sorted_var_freq_dict.items())]
top_var_freqdfViz = pd.DataFrame(ranked_var_freqViz)
top_var_freqdfViz.columns=['Rank', 'County', 'Number_of_variables', 'List_of_variables']
headerColor = 'grey'
rowEvenColor = 'lightgrey'
rowOddColor = 'white'
fig = go.Figure(data=[go.Table(
columnwidth = [40,70,100,600],
header=dict(
values=['Rank', 'County Name', 'Number of Demographic Variables', 'List of Demographics'],
line_color='darkslategray',
fill_color=headerColor,
align=['center','center', 'center', 'left'],
font=dict(color='white', size=12)
),
cells=dict(
values=[top_var_freqdfViz.Rank, top_var_freqdfViz.County, top_var_freqdfViz.Number_of_variables,
top_var_freqdfViz.List_of_variables],
line_color='darkslategray',
# 2-D list of colors for alternating rows
fill_color = [[rowOddColor,rowEvenColor,rowOddColor, rowEvenColor,rowOddColor]*5],
align = ['center', 'center', 'center', 'left'],
font = dict(color = 'darkslategray', size = 11)
))
])
fig.show()
# # Make ranking table of vars.
ranked_vars_list = [(i+1, item[0], len(item[1]), item[1]) for i, item in enumerate(sorted_var_freq_dict.items())]
top_var_freqdf = pd.DataFrame(ranked_vars_list)
top_var_freqdf.columns=['Rank', 'County', 'Number_of_variables', 'List_of_variables']
# Separate the list of counties into individual rows
test2 = top_var_freqdf.explode('List_of_variables')
# Drop the duplicate columns
test2 = test2.drop_duplicates(subset=['Rank', 'List_of_variables'])
# Rename the columns for clarity
test5 = test2.rename(columns={
'List_of_variables': 'Demographic_Variable',
'Number_of_variables': 'Number_of_variables',
'County': 'County'
})
# Display the updated data frame
# test5.head()
tidy_df2 = pd.merge(test5, melted_df[['County Name', 'Attribute', 'Value']], left_on=['County','Demographic_Variable'],
right_on=['County Name', 'Attribute'])
# create bar chart trace
fig = px.bar(tidy_df2, x=tidy_df2.County, color='Demographic_Variable',text='Value',
labels={"Attribute": "Attribute",
"Value": "Attribute Value",
"Demographic_Variable":"Demographic Var."
},
title="Default behavior: some text is tiny",)
fig.update_traces(textposition='inside')
# update the layout to adjust the size of the plot
fig.update_layout(
width=1500, # set the width of the plot to 800 pixels
height=500, # set the height of the plot to 600 pixels
)
# display the plot
fig.show()
## Most common variables
res = sum(sorted_var_freq_dict.values(), [])
mostCommonVar = list(set(res))
mostCommonVar
['Per capita income in past 12 months (in 2021 dollars), 2017-2021', 'In civilian labor force, female of population age 16 years+, 2017-2021', 'Total employment, 2020', 'Population per square mile, 2020', 'Total annual payroll, 2020 ($1,000)', 'Persons in poverty', 'Median household income (in 2021 dollars), 2017-2021', 'Persons without health insurance, under age 65 years', 'In civilian labor force, total of population age 16 years+, 2017-2021', 'Mean travel time to work (minutes), workers age 16 years+, 2017-2021', 'With a disability, under age 65 years, 2017-2021', 'Total employer establishments, 2020', 'Total retail sales per capita, 2017']
mostCommonCounties = top_var_freqdfViz['County'].tolist()
print(mostCommonCounties)
['Ashland', 'Sandusky', 'Huron', 'Fulton', 'Shelby', 'Muskingum', 'Belmont', 'Washington', 'Darke', 'Marion', 'Jefferson', 'Tuscarawas', 'Clinton', 'Guernsey', 'Pickaway', 'Erie', 'Wayne', 'Greene', 'Columbiana', 'Defiance', 'Athens', 'Ross', 'Mercer', 'Lawrence', 'Trumbull', 'Wyandot', 'Van Wert', 'Clark', 'Williams', 'Ashtabula', 'Scioto', 'Wood', 'Warren', 'Licking', 'Ottawa', 'Paulding', 'Preble', 'Henry', 'Richland', 'Champaign', 'Miami', 'Holmes', 'Clermont', 'Delaware', 'Morrow', 'Carroll', 'Hardin', 'Coshocton', 'Brown', 'Lake', 'Portage', 'Fairfield', 'Highland']
from urllib.request import urlopen
import json
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
counties = json.load(response)
import pandas as pd
df = pd.read_csv('https://raw.githubusercontent.com/plotly/datasets/master/minoritymajority.csv',
dtype={"fips": str})
# Filter the data frame to include only Ohio
df_ohio = df[df['STNAME'] == 'Ohio']
# Filter the counties GeoJSON file to include only Ohio counties
counties_ohio = {'type': 'FeatureCollection', 'features': []}
for feature in counties['features']:
if feature['id'][:2] == '39':
counties_ohio['features'].append(feature)
cluster2_df['TempCounty Name'] = cluster2_df['County Name'].apply(lambda x: x[:-6])
# merge the two data frames based on the 'County Name' and 'CTYNAME' columns
merged_df = pd.merge(cluster2_df, df_ohio[['CTYNAME', 'FIPS']], left_on='TempCounty Name', right_on='CTYNAME')
# drop the duplicate 'CTYNAME' column
merged_df.drop('CTYNAME', axis=1, inplace=True)
merged_df.drop('TempCounty Name', axis=1, inplace=True)
# print the merged data frame
merged_df.head()
| Bachelor's degree or higher of persons age 25 years+, 2017-2021 | With a disability, under age 65 years, 2017-2021 | Persons without health insurance, under age 65 years | In civilian labor force, total of population age 16 years+, 2017-2021 | In civilian labor force, female of population age 16 years+, 2017-2021 | Total retail sales per capita, 2017 | Mean travel time to work (minutes), workers age 16 years+, 2017-2021 | Median household income (in 2021 dollars), 2017-2021 | Per capita income in past 12 months (in 2021 dollars), 2017-2021 | Persons in poverty | Total employer establishments, 2020 | Total employment, 2020 | Total annual payroll, 2020 ($1,000) | Population per square mile, 2020 | Land area in square miles, 2020 | cluster_id | Banned or not | County Name | FIPS | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4406.72 | 4434.26 | 2588.95 | 14184.13 | 14018.88 | 9461 | 36.1 | 44467 | 24539 | 5205.44 | 385 | 4572 | 167427 | 47.1 | 583.87 | 0 | 0.0 | Adams County, Ohio | 39001 |
| 1 | 19418.97 | 12912.09 | 8540.28 | 61917.03 | 58765.26 | 16266 | 19.5 | 55114 | 28671 | 15555.51 | 2286 | 45012 | 1983272 | 253.9 | 402.55 | 0 | 1.0 | Allen County, Ohio | 39003 |
| 2 | 11614.15 | 5179.28 | 4708.44 | 32592.87 | 29663.17 | 9431 | 24.2 | 58168 | 28992 | 5702.44 | 1034 | 18234 | 701075 | 124.0 | 422.99 | 0 | 0.0 | Ashland County, Ohio | 39005 |
| 3 | 14697.89 | 12361.80 | 9733.70 | 55190.08 | 51296.60 | 10406 | 25.8 | 49680 | 26777 | 15281.91 | 1806 | 24464 | 913850 | 139.0 | 702.07 | 0 | 0.0 | Ashtabula County, Ohio | 39007 |
| 4 | 21161.10 | 8005.22 | 5833.26 | 34316.97 | 33386.13 | 11148 | 21.9 | 47061 | 24990 | 12969.70 | 1029 | 13265 | 469095 | 124.0 | 503.64 | 0 | 0.0 | Athens County, Ohio | 39009 |
merged_df['TempCounty Name'] = merged_df['County Name'].apply(lambda x: x[:-13])
# Example list of counties to check
counties_to_check = merged_df['TempCounty Name'].tolist()
# Create a list to hold safe counties
safe_counties = []
# Check each county and append to the safe_counties list if not in either of the two lists
for county in counties_to_check:
if county not in mostCommonCounties and county not in bannedCountiesList:
safe_counties.append(county)
# Print the list of safe counties
print(safe_counties)
['Adams', 'Cuyahoga', 'Fayette', 'Franklin', 'Gallia', 'Geauga', 'Hamilton', 'Harrison', 'Hocking', 'Jackson', 'Lorain', 'Lucas', 'Madison', 'Mahoning', 'Meigs', 'Monroe', 'Montgomery', 'Morgan', 'Noble', 'Perry', 'Pike', 'Putnam', 'Stark', 'Vinton', 'Summit']
merged_df['risk'] = merged_df['TempCounty Name'].apply(lambda x: 'At risk' if x in mostCommonCounties else
'Already has Banned' if x in bannedCountiesList else
'Safe' if x in safe_counties else merged_df[merged_df['TempCounty Name'] == x]['risk'].values[0])
merged_df.head()
| Bachelor's degree or higher of persons age 25 years+, 2017-2021 | With a disability, under age 65 years, 2017-2021 | Persons without health insurance, under age 65 years | In civilian labor force, total of population age 16 years+, 2017-2021 | In civilian labor force, female of population age 16 years+, 2017-2021 | Total retail sales per capita, 2017 | Mean travel time to work (minutes), workers age 16 years+, 2017-2021 | Median household income (in 2021 dollars), 2017-2021 | Per capita income in past 12 months (in 2021 dollars), 2017-2021 | Persons in poverty | ... | Total employment, 2020 | Total annual payroll, 2020 ($1,000) | Population per square mile, 2020 | Land area in square miles, 2020 | cluster_id | Banned or not | County Name | FIPS | TempCounty Name | risk | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 4406.72 | 4434.26 | 2588.95 | 14184.13 | 14018.88 | 9461 | 36.1 | 44467 | 24539 | 5205.44 | ... | 4572 | 167427 | 47.1 | 583.87 | 0 | 0.0 | Adams County, Ohio | 39001 | Adams | Safe |
| 1 | 19418.97 | 12912.09 | 8540.28 | 61917.03 | 58765.26 | 16266 | 19.5 | 55114 | 28671 | 15555.51 | ... | 45012 | 1983272 | 253.9 | 402.55 | 0 | 1.0 | Allen County, Ohio | 39003 | Allen | Already has Banned |
| 2 | 11614.15 | 5179.28 | 4708.44 | 32592.87 | 29663.17 | 9431 | 24.2 | 58168 | 28992 | 5702.44 | ... | 18234 | 701075 | 124.0 | 422.99 | 0 | 0.0 | Ashland County, Ohio | 39005 | Ashland | At risk |
| 3 | 14697.89 | 12361.80 | 9733.70 | 55190.08 | 51296.60 | 10406 | 25.8 | 49680 | 26777 | 15281.91 | ... | 24464 | 913850 | 139.0 | 702.07 | 0 | 0.0 | Ashtabula County, Ohio | 39007 | Ashtabula | At risk |
| 4 | 21161.10 | 8005.22 | 5833.26 | 34316.97 | 33386.13 | 11148 | 21.9 | 47061 | 24990 | 12969.70 | ... | 13265 | 469095 | 124.0 | 503.64 | 0 | 0.0 | Athens County, Ohio | 39009 | Athens | At risk |
5 rows × 21 columns
# One layer
import plotly.express as px
fig = px.choropleth(merged_df, geojson=counties_ohio, locations='FIPS', color='risk',
color_continuous_scale="Viridis",
range_color=(0, 2),
scope="usa",
hover_data=["County Name","Bachelor's degree or higher of persons age 25 years+, 2017-2021",
'With a disability, under age 65 years, 2017-2021'],
labels={'risk':'Risk Level'})
fig.update_geos(fitbounds="locations", visible=False)
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()